SHEA Spring 2024 Data Visualization Demo

Matthew Ziegler

2024-03-26

Overview

R Introduction

R Introduction

R Introduction

Getting Started

library(dplyr)
library(ggplot2)
library(gt)

Piping Data

data %>% new_data %>% tables

data %>% different_data %>% figures

Loading our Data

dat_cdiff <- read.csv("/Users/matthewziegler/Documents/GitHub/shea24_demo/HAICViz_-_CDI_20240214.csv") %>%
  janitor::clean_names() %>%
  filter(topic =="Case rates (cases per 100,000)", 
         series =="Community-associated"|series=="Healthcare-associated")
dat_cdiff %>%
  select(year_name, series, value, series, topic) %>%
  slice_sample(n=10) %>%
  gt()
year_name series value topic
2016 Community-associated 67.20 Case rates (cases per 100,000)
2011 Healthcare-associated 92.76 Case rates (cases per 100,000)
2012 Community-associated 52.88 Case rates (cases per 100,000)
2015 Healthcare-associated 82.74 Case rates (cases per 100,000)
2019 Healthcare-associated 57.90 Case rates (cases per 100,000)
2014 Community-associated 57.83 Case rates (cases per 100,000)
2015 Community-associated 65.81 Case rates (cases per 100,000)
2012 Healthcare-associated 92.90 Case rates (cases per 100,000)
2017 Healthcare-associated 66.96 Case rates (cases per 100,000)
2011 Community-associated 48.16 Case rates (cases per 100,000)

Let’s do a simple graph

dat_cdiff_line_plot <- dat_cdiff %>%
  ggplot(aes(x = as.factor(year_name), y = value, group = series)) + 
  geom_line(aes(linetype = series)) +
  labs(title = "Cases by year", y = "CDI cases per 1000 individuals", x = "Year")

Let’s do a simple graph

Let’s do a simple graph

dat_cdiff %>%
  ggplot(aes(x = as.factor(year_name), y = value, fill= series)) + 
  geom_col(position = "dodge") +
  labs(title = "Cases by year", y = "CDI cases per 1000 individuals", x = "Year")

A little more complicated

library(tidyr)
dat_cdiff_cat_plot <- read.csv("/Users/matthewziegler/Documents/GitHub/shea24_demo/HAICViz_-_CDI_20240214.csv") %>%
  janitor::clean_names() %>%
  filter(topic =="Case rates (cases per 100,000)") %>%
  mutate(class = case_when(
    grepl("HA|CA", series) ==TRUE & grepl("years", series) ==TRUE  ~ "age",
    grepl("Male|Female", series) ==TRUE & grepl("HA|CA", series) ==TRUE  ~ "sex",
    grepl("White|Non-white", series) ==TRUE & grepl("HA|CA", series) ==TRUE ~ "race")) %>%
  filter(!is.na(class)) %>%
  separate(series, into = c("onset","group"), sep =" - ") %>%
  rename(description = topic) 

dat_cdiff_cat_plot %>%
  ggplot(aes(x = as.factor(year_name), y = value, 
             group = interaction(group, onset),linetype = onset, col = group,)) + 
  geom_line(lwd =1) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 75, vjust = 0, hjust=0)) +
  facet_wrap(vars(class)) +
  labs(title = "C.difficile Infection by Year", y = "Case rates (cases per 100,000)", x = "Year") 

A little more complicated

library(gt)

dat_cdiff_cat_plot %>%
  select(year_name, value, description, class, onset, group) %>%
  slice_sample(n=10) %>%
  gt()
year_name value description class onset group
2014 74.73 Case rates (cases per 100,000) sex HA Male
2012 104.10 Case rates (cases per 100,000) sex HA Female
2018 262.40 Case rates (cases per 100,000) age HA 65+ years
2013 83.73 Case rates (cases per 100,000) age HA 45-64 years
2019 224.70 Case rates (cases per 100,000) age HA 65+ years
2013 21.61 Case rates (cases per 100,000) age HA 18-44 years
2019 161.10 Case rates (cases per 100,000) age CA 65+ years
2017 72.21 Case rates (cases per 100,000) race HA White
2015 24.80 Case rates (cases per 100,000) age HA 18-44 years
2017 38.44 Case rates (cases per 100,000) age CA 18-44 years

Highlighting and annotating

library(gghighlight)
dat_mdrgn <- read.csv("/Users/matthewziegler/Documents/GitHub/shea24_demo/HAICViz_-_MuGSI_20240330.csv") %>%
  janitor::clean_names() %>%
  mutate(keep = case_when(
    viewby =="Organism" & series != "All cases"  ~ 1,
    organism =="CRAB" & viewby == "All cases" & topic =="Case Rates" ~ 1,
    TRUE ~ 0
  )) %>%
  filter(keep ==1) %>%
  mutate(series = ifelse(organism =="CRAB","Acinetobacter baumanii", series)) %>%
  select(-organism) %>%
  rename(organism = series)
dat_mdrgn_plot <- dat_mdrgn %>%  
  ggplot(aes(x = as.factor(year_name), y = value, group = organism)) + 
  geom_line(aes(linetype = organism)) +
  theme(axis.text.x = element_text(angle = 75, vjust = 0, hjust=0)) +
  labs(title = "Cases by year - Carbapenem-Resistant GNB",
       y = "Cases per 1000 individuals", x = "Year") +
  gghighlight(organism =="Acinetobacter baumanii") +
  theme_minimal()
year_name topic viewby organism value
2014 Case Rates Organism Enterobacter cloacae 0.47
2020 Case Rates Organism Klebsiella oxytoca 0.11
2015 Case Rates Organism Klebsiella pneumoniae 1.47
2017 Case Rates Organism Enterobacter cloacae 2.68
2012 Case Rates Organism Klebsiella oxytoca 0.03
2015 Case Rates All cases Acinetobacter baumanii 1.10
2019 Case Rates Organism Klebsiella aerogenes 0.46
2017 Case Rates Organism Klebsiella oxytoca 0.14
2013 Case Rates Organism Escherichia coli 0.43
2017 Case Rates All cases Acinetobacter baumanii 0.80

Animated figures

library(magick)
library(gganimate)
library(maps)
library(tidyr)
respi <- read.csv("/Users/matthewziegler/Documents/GitHub/shea24_demo/Outpatient_Respiratory_Illness_Activity_Map_20240401.csv") %>%
  janitor::clean_names() %>%
  mutate(region = tolower(state)) %>%
  separate(activity_level, into=c(NA, "level"), sep = " ") %>%
  mutate(level = as.numeric(level)) %>%
  filter(season =="2022-2023")

states <- map_data("state")
region_dat_respi <- left_join(states, respi, by = "region")
region_dat_respi %>%
  select(region, season, week, level, activity_level_label, long, lat) %>%
  slice_sample(n=10) %>%
  gt()
region season week level activity_level_label long lat
virginia 2022-2023 14 4 Low -76.46695 37.31674
missouri 2022-2023 19 2 Minimal -89.15223 36.67503
north carolina 2022-2023 30 1 Minimal -76.48987 36.54898
virginia 2022-2023 1 10 High -76.26641 37.05891
massachusetts 2022-2023 21 2 Minimal -72.81721 42.00927
massachusetts 2022-2023 14 3 Minimal -70.78893 42.23845
indiana 2022-2023 51 11 Very High -87.16407 37.83813
virginia 2022-2023 50 11 Very High -77.26336 38.58298
florida 2022-2023 25 1 Minimal -86.47652 30.44698
montana 2022-2023 21 1 Minimal -113.93266 45.69911
# gif_a <- region_dat_respi %>%
# ggplot(., aes(long, lat, group = group)) +
#   geom_polygon(aes(fill = level),
#                colour = alpha("white", 1/2), size = 0.05)  +
#   geom_polygon(data = states, colour = "black", fill = NA) +
#   scale_fill_gradientn(colours = terrain.colors(6))  +
#   theme_void() +
#   transition_time(week) +
#   labs(title = 'Respiratory Infection Activity 22-23 Season: Week {frame_time}') +
#   theme_minimal()
# 
# gif_a <- animate(gif_a, width = 700, height = 480)

Animate

# gif_b <- region_dat_respi %>%
#   #filter(!is.na(value)) %>%
#   ggplot(data = ., aes(y = level)) + geom_boxplot() +
#   labs(x = "", title = "National Value") +
#   theme(axis.text.x = element_blank()) +
#   transition_time(week)
#   #enter_fade() +
#   #exit_shrink() +
#   #ease_aes('sine-in-out')
# 
# gif_b <- animate(gif_b, width = 600, height = 480)

Animate

# print(gif_a)
# print(gif_b)
# 
# a_mgif <- image_read(gif_a)
# b_mgif <- image_read(gif_b)
# 
# new_gif <- image_append(c(a_mgif[1], b_mgif[1]))
# for(i in 1:95){
#   combined <- image_append(c(a_mgif[i], b_mgif[i]))
#   new_gif <- c(new_gif, combined)
# }

Plotting Model Output

library(gtsummary)
library(marginaleffects)

latitude_by_states <- states %>%
  group_by(region) %>%
  summarise(mean_lat = mean(lat))

Plotting Model Output

vaccination <- read.csv("/Users/matthewziegler/Documents/GitHub/shea24_demo/Vaccination_Coverage_among_Health_Care_Personnel_20240401.csv") %>%
  janitor::clean_names() %>%
  mutate(year = as.numeric(substr(season,1,4))) %>%
  mutate(region = tolower(geography)) %>%
  left_join(latitude_by_states, by = "region") %>%
  rename(latitude = mean_lat) %>%
  filter(personnel_type != "All Health Care Personnel")
vaccination %>%
  select(vaccine, region, year, personnel_type, estimate, sample_size, latitude) %>%
  slice_sample(n=10) %>%
  gt()
vaccine region year personnel_type estimate sample_size latitude
Seasonal Influenza wisconsin 2013 Adult Students/Trainees and Volunteers 86.6 30351 44.95970
Seasonal Influenza north carolina 2016 Adult Students/Trainees and Volunteers 88.2 36164 35.51161
Seasonal Influenza nebraska 2020 Employees 91.2 40525 41.71496
Seasonal Influenza north dakota 2013 Adult Students/Trainees and Volunteers 90.7 2849 47.43643
Seasonal Influenza louisiana 2017 Licensed Independent Practitioners 70.8 20633 30.44209
Seasonal Influenza maine 2018 Adult Students/Trainees and Volunteers 87.2 3874 44.82937
Seasonal Influenza alaska 2017 Employees 79.6 17105 NA
Seasonal Influenza texas 2018 Licensed Independent Practitioners 70.4 123029 30.08546
Seasonal Influenza tennessee 2016 Licensed Independent Practitioners 75.1 25408 35.88569
Seasonal Influenza connecticut 2020 Adult Students/Trainees and Volunteers 91.2 9643 41.38260

Plotting Model Output

model <- lm(estimate ~ year + latitude + personnel_type, dat = vaccination)

Plotting Model Output

tbl_regression(model)
Characteristic Beta 95% CI1 p-value
year 1.2 0.96, 1.5 <0.001
latitude 0.34 0.22, 0.47 <0.001
personnel_type


    Adult Students/Trainees and Volunteers
    Employees 3.2 1.7, 4.6 <0.001
    Licensed Independent Practitioners -14 -16, -13 <0.001
1 CI = Confidence Interval

Plotting Model Output

plot_predictions(model, condition = "year") +
  labs(y = "Estimated Percentage: Vaccine Compliance", x= "Year", 
       title = "Estimated Vaccine Compliance by Year") +
  theme_minimal()

Plotting Model Output

What Else?

Conclusions

Resources

Citations

#devtools::session_info()

# citation("base")
# citation("ggplot2")
# citation("gt")
# # citation("dplyr")
# citation("tidyr")
# # citation("gghighlight")
# citation("magick")
# # citation("gganimate")
# citation("maps")
# # citation("gtsummary")
# # citation("marginaleffects")